rm(list = ls())
library(data.table)
library(foreign)
library(kernlab)
library(MASS) # for mvrnorm
library(ggplot2)
library(gridExtra)
library(glmnet)
library(caret)
library(memisc) #cases
setwd('~/github/bdr/')
source('functions.R')
# import data
data_sept18 = data.table(read.spss('data/Sept18/Sept18 public.sav', to.data.frame = T), stringsAsFactors = F)
Undeclared level(s) 2, 3, 4 added in variable: densityUndeclared level(s) 2, 3, 4 added in variable: sdensityUndeclared level(s) 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 added in variable: ageUndeclared level(s) 1, 2, 3, 4, 5, 6, 7 added in variable: hh1Undeclared level(s) 1, 2, 3, 4, 5, 6, 7 added in variable: hh3
data_sept18
data_june18 = data.table(read.spss('data/June18/June18 public.sav', to.data.frame = T), stringsAsFactors = F)
Undeclared level(s) 2, 3, 4 added in variable: densityUndeclared level(s) 2, 3, 4 added in variable: sdensityUndeclared level(s) 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94 added in variable: ageUndeclared level(s) 1, 2, 3, 4, 5, 6, 7 added in variable: hh1Undeclared level(s) 1, 2, 3, 4, 5, 6 added in variable: hh3
data_june18
data_may18 = data.table(read.spss('data/May18/May18 public.sav', to.data.frame = T), stringsAsFactors = F)
Undeclared level(s) 2, 3, 4 added in variable: densityUndeclared level(s) 2, 3, 4 added in variable: sdensityUndeclared level(s) 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 20, 24 added in variable: slepnhisUndeclared level(s) 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 92, 94, 95 added in variable: ageUndeclared level(s) 1, 2, 3, 4, 5, 6, 7 added in variable: hh1Undeclared level(s) 1, 2, 3, 4, 5, 6, 7 added in variable: hh3
data_may18
# data_march18 = data.table(read.spss('data/March18/March18 public.sav', to.data.frame = T), stringsAsFactors = F)
# data_march18
###### PREP DATA
#specify which should be survey
survey_vars = c('demo_mode', 'demo_education', 'demo_phonetype', 'month_called', 'demo_ideology')
file_and_survey_vars = c('demo_sex', 'demo_age_bucket', 'demo_state', 'demo_income', 'demo_region', 'demo_race', 'demo_hispanic')
# covars = c('sample', 'int_date', 'sex', 'educ', 'hisp', 'racecmb', 'relig', 'income', 'party', 'hh1', 'ql1', 'sstate', 'sdensity')
#
# # check that covariates are in all data sets
# covars %in% names(data_june18)
# covars %in% names(data_sept18)
# covars %in% names(data_may18)
# covars %in% names(data_march18)
data_sept18
data_may18[, .N, party]
NAs introduced by coercionconditions are not mutually exclusivecondition is.na(age_num) is never satisfiedconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveAdding new column 'qsupport' then assigning NULL (deleting it).NAs introduced by coercionconditions are not mutually exclusivecondition is.na(age_num) is never satisfiedconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveAdding new column 'qsupport' then assigning NULL (deleting it).NAs introduced by coercionconditions are not mutually exclusivecondition is.na(age_num) is never satisfiedconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveconditions are not mutually exclusiveAdding new column 'qsupport' then assigning NULL (deleting it).
# create data table with vars and levels
covars = names(data_recoded)[grepl('demo', names(data_recoded))]
covars = data.table(do.call(rbind, lapply(covars, function(c){
cbind(c, data_recoded[, .(level = unique(get(c)))][order(level)])
})))
setnames(covars, c('var', 'level'))
covars[, level_modmat := paste0(var, level)]
covars[, in_survey := as.numeric(var %in% survey_vars)]
covars[, in_both := as.numeric(var %in% file_and_survey_vars)]
covars[, in_file := as.numeric(in_survey + in_both == 0)]
# scale age and set NAs to 0
data_recoded[, age_scaled := scale(age_num)]
data_recoded[is.na(age_scaled), age_scaled := 0]
data_recoded[, p_surveyed :=
(-2)
+ age_scaled
- 0.5 * is.na(age_num)
+ 1.5 * as.numeric(demo_mode == 'cell')
- 1.5 * as.numeric(demo_party == '05-Ind')
- 3 * as.numeric(demo_party == "99-DK/refused")
+ 1.5 * as.numeric(demo_education %in% c('01-postgrad', '02-bach'))
+ 3 * as.numeric(demo_ideology == 'Very conservative' | demo_ideology == 'Very liberal')
]
data_recoded[, p_surveyed := exp(p_surveyed)/(1 + exp(p_surveyed))]
hist(data_recoded[, p_surveyed])
data_recoded[, .(.N, mean(p_surveyed)), .(demo_age_bucket)][order(demo_age_bucket)]
data_recoded[, .(.N, mean(p_surveyed)), .(demo_mode)][order(demo_mode)]
data_recoded[, .(.N, mean(p_surveyed)), .(demo_party)][order(demo_party)]
data_recoded[, .(.N, mean(p_surveyed)), .(demo_ideology)][order(demo_ideology)]
data_recoded[, p_matched := NULL]
Adding new column 'p_matched' then assigning NULL (deleting it).
data_recoded[, p_matched :=
-2 +
-2 * as.numeric(demo_mode == 'landline')
+ 3 * as.numeric(demo_race == 'W')
+ -2 * as.numeric(demo_reg == '03-No')
+ -1 * as.numeric(demo_hhsize == 2)
+ -2 * as.numeric(demo_hhsize == 3)
+ 0.5*age_scaled
+ as.numeric(demo_income)/3
- 4* as.numeric(demo_income == '99-DK/refused')
]
data_recoded[, p_matched := exp(p_matched)/(1 + exp(p_matched))]
hist(data_recoded$p_matched)
data_recoded[, .(.N, mean(p_matched)), demo_mode]
data_recoded[, .(.N, mean(p_matched)), demo_hispanic]
data_recoded[, .(.N, mean(p_matched)), demo_age_bucket][order(demo_age_bucket)]
testtrain = getTestTrain(data = data_recoded
, n_holdout = 1000, n_surveyed = 2000, n_matched = 1000
, p_surveyed = data_recoded$p_surveyed
, p_matched = data_recoded$p_matched
)
Adding new column 'holdout' then assigning NULL (deleting it).Adding new column 'surveyed' then assigning NULL (deleting it).Adding new column 'matched' then assigning NULL (deleting it).
data_recoded[, .N, list(holdout, surveyed, matched, voterfile)]
data_recoded = testtrain$data
data_recoded[, .(.N, prop_surveyed = mean(surveyed)), demo_age_bucket][order(demo_age_bucket)]
data_recoded[, .(.N, prop_surveyed = mean(surveyed)), demo_party][order(demo_party)]
data_recoded[, .(.N, mean(y_dem)), .(holdout, surveyed, matched)]
data_recoded[, mean(y_dem)]
[1] 0.4864043
landmarks = getLandmarks(data = data_recoded
, vars = unique(covars[in_both == 1 | in_file == 1,]$var)
, n_landmarks = 12
, subset_ind = (data_recoded$voterfile == 1))
X_file = landmarks$X[data_recoded$voterfile == 1, ]
n_landmarks
[1] 12
Choose scale parameter \(\sigma\) for RBF kernel - use median heuristic for now
sigma
[1] 1.30073e-05
Do basic Dist Reg
Hyperparameter tuning
names(fit_basicDR)
[1] "data" "fit" "landmarks" "bags" "y_hat" "mse_test"
param_nlandmarks
[1] 12 20 33 55 90 148 245 403 665 1097